library(tidyverse)
library(ggrepel) # Mejora la visualización de texto en ggplots evitando
library(gprofiler2) # enriquecimiento grofiler
library(clusterProfiler) # para enriquecimientos
library(enrichplot) # plots de enriquecimiento
library(DOSE) # needed to convert to enrichResult object
library(reactable) # para tablas interactivas
important.genes.umap1pos <- readRDS("important.genes.umap1pos.RDS")
load("mi_entorno_EnrDif.RData")

1 UMAP

umap.data <- data.frame(umap.ad$layout)[,1:2]
colnames(umap.data) <- c("umap1", "umap2")

covs2_df <- covs2.casos %>%
  dplyr::select(mrna_id, sampleset_UMAP)

para.plot <- merge(umap.data, covs2_df, by = "row.names")
rownames(para.plot) <- para.plot$Row.names
para.plot$Row.names <- NULL

ggplot(para.plot, aes_string(x = "umap1", y = "umap2", color = "sampleset_UMAP")) +
    geom_point(show.legend = TRUE, size = 3) +
    geom_text_repel(aes(label=ifelse(mrna_id %in% rownames(outliers.umap1) | mrna_id %in% rownames(outliers.umap2), as.character(gsub("_.*", "", covs2$mrna_id)), "")),
                  color = "black",
                  max.overlaps = 30, # Reduce el número máximo de solapamientos
                  point.padding = unit(0.2, "lines"), # Menos padding alrededor de los puntos
                  size = 3, 
                  fontface = "bold",
                  segment.size = 0.2, # Líneas de guía más finas
                  segment.color = 'grey50',
                  max.segment.length = unit(0.5, "lines"), # Líneas de guía más cortas
                  arrow = arrow(length = unit(0.02, "npc"), type = "closed", ends = "last")) +
    theme_minimal() +
    labs(title = "UMAP",
         x = "umap1", y = "umap2") +
    theme(
      legend.title = element_blank(),
      legend.text = element_text(size = 12),
      text = element_text(size = 12),
      legend.key.size = unit(0.5, "cm")) +
  geom_point(data= (covs2 %>% filter(neuroStatus == 0)), aes_string(x = "umap1", y = "umap2", color = "sampleset_UMAP"), alpha = 0.5) 

1.1 Seleccion genes importantes

estado <- rep(0, nrow(mat_exp_alz_genes))

estado[rownames(mat_exp_alz_genes) %in% mUMAP1.pos] <- 1

# Añadir la nueva columna a la matriz de expresión de genes
mat_exp_alz_genes.RF <- data.frame(cbind(mat_exp_alz_genes, class = estado))

mat_exp_alz_genes.RF$class <- as.factor(mat_exp_alz_genes.RF$class)

mat_exp_alz_genes.RF <- mat_exp_alz_genes.RF[rownames(covs2.casos),]
library(caret)
library(randomForest)
library(doParallel)
num_cores <- detectCores() - 2
registerDoParallel(cores=num_cores)
modelLookup("ranger")

Aquí he utilizado el parámetro importance = “permutation”

# Definir una función para calcular la Accuracy balanceada
balanced_accuracy <- function(data, lev = NULL, model = NULL) {
  cm <- confusionMatrix(data$pred, data$obs)
  sensitivity <- cm$byClass["Sensitivity"]
  specificity <- cm$byClass["Specificity"]
  balanced_acc <- mean(c(sensitivity, specificity))
  c(BalancedAccuracy = balanced_acc)
}

# Definir las métricas de evaluación que queremos utilizar
RFControl <- trainControl(
  method = "cv", 
  number = 10,
  repeats = 10,
  summaryFunction = balanced_accuracy,
  allowParallel = TRUE,
  seeds = NULL, 
  returnResamp = "final",
)

set.seed(1234)
sqr.genes <- round(sqrt(ncol(mat_exp_alz_genes.RF)))

mygrid <- expand.grid(mtry = c(sqr.genes - 2, sqr.genes - 1, sqr.genes, sqr.genes+1, sqr.genes+2),
                      splitrule = c("gini", "extratrees"),
                      min.node.size = 1)
trees <- seq(500, 2000, 500) 

rf.cv.10 <- list() # lista vacia para meter los resultados de cada bucle
for (tree in trees){ # bucle en cada pasada hace un modelo con un número de arboles distintos
  modelo <- train(class ~., data=mat_exp_alz_genes.RF,
                  method="ranger",
                  tuneGrid=mygrid,
                  trControl=RFControl, 
                  ntree = tree,
                  metric = "BalancedAccuracy",
                  importance = "permutation" # o impurity permutation https://arxiv.org/abs/1407.7502
  )
  rf.cv.10[[paste(tree, "trees")]] <- modelo # metemos el modelo en la lista
}
# Hacemos dataframe con las métricas de cada modelo
datos_combinados <- rbind(
  data.frame(mtry = rf.cv.10$`500 trees`$results$mtry, splitrule = rf.cv.10$`500 trees`$results$splitrule, MetricValue = rf.cv.10$`500 trees`$results$BalancedAccuracy, Trees = "500 trees", Metric = "Balanced Accuracy"),
  data.frame(mtry = rf.cv.10$`1000 trees`$results$mtry, splitrule = rf.cv.10$`500 trees`$results$splitrule, MetricValue = rf.cv.10$`1000 trees`$results$BalancedAccuracy, Trees = "1000 trees", Metric = "Balanced Accuracy"),
  data.frame(mtry = rf.cv.10$`1500 trees`$results$mtry, splitrule = rf.cv.10$`500 trees`$results$splitrule, MetricValue = rf.cv.10$`1500 trees`$results$BalancedAccuracy, Trees = "1500 trees", Metric = "Balanced Accuracy"),
  data.frame(mtry = rf.cv.10$`2000 trees`$results$mtry, splitrule = rf.cv.10$`500 trees`$results$splitrule, MetricValue = rf.cv.10$`2000 trees`$results$BalancedAccuracy, Trees = "2000 trees", Metric = "Balanced Accuracy")
)

datos_combinados$Trees <- factor(datos_combinados$Trees, levels = c("500 trees", "1000 trees", "1500 trees", "2000 trees"))

datos_combinados %>%
  # filter(splitrule == "extratrees") %>%
  filter(Metric == "Balanced Accuracy") %>%
  ggplot(aes(x = mtry, y = MetricValue, color = Trees)) +
  geom_point(size = 4) +
  geom_line(linetype = "dashed") +
  theme_minimal() +
  labs(x = "Mtry", y = "Valor de la Métrica", title = "Comparación de Balanced Accuracy con splitrule") +
  facet_wrap(~ splitrule) 

models <- resamples(rf.cv.10)

summary(models)
## 
## Call:
## summary.resamples(object = models)
## 
## Models: 500 trees, 1000 trees, 1500 trees, 2000 trees 
## Number of resamples: 10 
## 
## BalancedAccuracy 
##            Min. 1st Qu.    Median      Mean   3rd Qu. Max. NA's
## 500 trees   0.5  0.6875 0.8333333 0.7916667 0.9583333    1    0
## 1000 trees  0.5  0.6875 0.7916667 0.7833333 0.8333333    1    0
## 1500 trees  0.5  0.6875 0.7916667 0.7833333 0.9583333    1    0
## 2000 trees  0.5  0.7500 0.7916667 0.8000000 0.8333333    1    0

1.2 Particón infra y sobreexpresados

medianas_por_clase <- aggregate(. ~ class, data = mat_exp_alz_genes.RF, FUN = median)

medianas_clase_0 <- medianas_por_clase[medianas_por_clase$class == 0, ]
medianas_clase_1 <- medianas_por_clase[medianas_por_clase$class == 1, ]

genes_upregulados <- colnames(mat_exp_alz_genes.RF)[sapply(colnames(mat_exp_alz_genes.RF), function(gene) {
  if (gene != "class") {
    return(medianas_clase_1[[gene]] > medianas_clase_0[[gene]])
  } else {
    return(FALSE)
  }
})]

genes_downregulados <- colnames(mat_exp_alz_genes.RF)[sapply(colnames(mat_exp_alz_genes.RF), function(gene) {
  if (gene != "class") {
    return(medianas_clase_1[[gene]] < medianas_clase_0[[gene]])
  } else {
    return(FALSE)
  }
})]
important.genes.umap1pos <- varImp(rf.cv.10$`2000 trees`)$importance %>% 
  rownames_to_column() %>%
  arrange(desc(Overall))

reactable(important.genes.umap1pos)
important.genes.umap1pos %>%
  filter(rowname %in% genes_upregulados) %>%
  head(50) %>%
  mutate(row_index = row_number(),  # Crear una columna numérica para el índice
         rowname = forcats::fct_inorder(rowname)) %>%
  ggplot(aes(x = row_index, y = Overall)) +  # Usar row_index en el eje X
    geom_point() +
    geom_line() +
    labs(x = "Número Genes", title = "Importancia genes sobreexpresados en UMAP1+ en RF") +
    theme_bw() +
    theme(axis.text.x = element_text(size = 10))

important.genes.umap1pos %>%
  filter(rowname %in% genes_downregulados) %>%
  head(50) %>%
  mutate(row_index = row_number(),  # Crear una columna numérica para el índice
         rowname = forcats::fct_inorder(rowname)) %>%
  ggplot(aes(x = row_index, y = Overall)) +  # Usar row_index en el eje X
    geom_point() +
    geom_line() +
    labs(x = "Número Genes", title = "Importancia genes infraexpresados en UMAP1+ en RF") +
    theme_bw() +
    theme(axis.text.x = element_text(size = 10))

2 Top 10

important.genes.top <- important.genes.umap1pos%>%
  head(., 10) 

selected_columns <- colnames(mat_exp_alz_genes.RF) %in% c(important.genes.top$rowname, "class")

df <- mat_exp_alz_genes.RF[, selected_columns]

df_long <- df %>%
  gather(key = "Gene", value = "Expression", -class)

df_long$Gene <- factor(df_long$Gene, levels = important.genes.top$rowname)

ggplot(df_long, aes(x = class, y = Expression, fill = class)) +
  geom_boxplot() +
  facet_wrap(~ Gene, scales = "free_y") +
  labs(title = "Boxplots Genes", x = "UMAP1pos", y = "Expression", fill = "mUMAP1pos") 

2.1 GO

matriz.AD.minus.important.genes <- mat_exp_alz_genes.RF[,!colnames(mat_exp_alz_genes.RF) %in% important.genes.top$rowname]

gp_up <- gost(list("Specific UMAP1+ genes" = important.genes.top$rowname, 
                          "Matriz genes (No UMAP1+ genes)" = colnames(matriz.AD.minus.important.genes)) , 
                     organism = "hsapiens",
                     sources = "GO", 
                     multi_query = T,
                     highlight = T,
                     exclude_iea = T)

gostplot(gp_up, interactive = T, capped = T)
top_genes_umap1pos.gprofiler = gconvert(important.genes.top$rowname)
top_genes_matriz.gprofiler = gconvert(colnames(matriz.AD.minus.important.genes))

# enrichment analysis using gene names
multi_gp = gost(list("Specific UMAP1+ genes" = top_genes_umap1pos.gprofiler$name,
                     "Matriz genes (No UMAP1+ genes)" = top_genes_matriz.gprofiler$name), 
          multi_query = FALSE, 
          evcodes = TRUE, 
          sources = "GO", 
          highlight = T,
          exclude_iea = T
          )

multi_gp$result <- multi_gp$result %>%
  filter(highlighted == T)

# modify the g:Profiler data frame
gp_mod = multi_gp$result[,c("query", "source", "term_id",
                                "term_name", "p_value", "query_size", 
                                "intersection_size", "term_size", 
                                "effective_domain_size", "intersection")]
gp_mod$GeneRatio = paste0(gp_mod$intersection_size,  "/", gp_mod$query_size)
gp_mod$BgRatio = paste0(gp_mod$term_size, "/", gp_mod$effective_domain_size)
names(gp_mod) = c("Cluster", "Category", "ID", "Description", "p.adjust", 
                    "query_size", "Count", "term_size", "effective_domain_size", 
                    "geneID", "GeneRatio", "BgRatio")
gp_mod$geneID = gsub(",", "/", gp_mod$geneID)

# define as enrichResult object
gp_mod_enrich  = new("enrichResult", result = gp_mod)

gp_mod_enrich@result <- gp_mod_enrich@result %>%
  mutate(generationumerico = Count / query_size) %>%
  arrange(desc(Cluster)) 

enrichplot::dotplot(gp_mod_enrich, showCategory = 20, x = "Cluster", size = "GeneRatio", title = "GO 10 genes")

write.csv(gp_mod_enrich@result, "top10_genes_umap1pos_GO.csv")

2.2 KEGG

gp_up_ordered = gost(list("Specific UMAP1+ genes" = important.genes.top$rowname, 
                          "Matriz genes (No UMAP1+ genes)" = colnames(matriz.AD.minus.important.genes)) , 
                     organism = "hsapiens",
                     sources = "KEGG", 
                     multi_query = T)

gostplot(gp_up_ordered, interactive = T, capped = T)
top_genes_umap1pos.gprofiler = gconvert(important.genes.top$rowname)
top_genes_matriz.gprofiler = gconvert(colnames(matriz.AD.minus.important.genes))

# enrichment analysis using gene names
multi_gp = gost(list("Specific UMAP1+ genes" = top_genes_umap1pos.gprofiler$name,
                     "Matriz genes (No UMAP1+ genes)" = top_genes_matriz.gprofiler$name), 
          multi_query = FALSE, 
          evcodes = TRUE, 
          sources = "KEGG")

# modify the g:Profiler data frame
gp_mod = multi_gp$result[,c("query", "source", "term_id",
                                "term_name", "p_value", "query_size", 
                                "intersection_size", "term_size", 
                                "effective_domain_size", "intersection")]
gp_mod$GeneRatio = paste0(gp_mod$intersection_size,  "/", gp_mod$query_size)
gp_mod$BgRatio = paste0(gp_mod$term_size, "/", gp_mod$effective_domain_size)
names(gp_mod) = c("Cluster", "Category", "ID", "Description", "p.adjust", 
                    "query_size", "Count", "term_size", "effective_domain_size", 
                    "geneID", "GeneRatio", "BgRatio")
gp_mod$geneID = gsub(",", "/", gp_mod$geneID)

# define as enrichResult object
gp_mod_enrich  = new("enrichResult", result = gp_mod)

gp_mod_enrich@result <- gp_mod_enrich@result %>%
  mutate(generationumerico = Count / query_size) %>%
  arrange(desc(generationumerico)) 

enrichplot::dotplot(gp_mod_enrich, showCategory = 30, x = "Cluster", size = "GeneRatio", title = "KEGG 10 genes")

write.csv(gp_mod_enrich@result, "top10_genes_umap1pos_KEGG.csv")

2.3 REAC

gp_up_ordered = gost(list("Specific UMAP1+ genes" = important.genes.top$rowname, 
                          "Matriz genes (No UMAP1+ genes)" = colnames(matriz.AD.minus.important.genes)) , 
                     organism = "hsapiens",
                     sources = "REAC", 
                     multi_query = T)

gostplot(gp_up_ordered, interactive = T, capped = T)
top_genes_umap1pos.gprofiler = gconvert(important.genes.top$rowname)
top_genes_matriz.gprofiler = gconvert(colnames(matriz.AD.minus.important.genes))

# enrichment analysis using gene names
multi_gp = gost(list("Specific UMAP1+ genes" = top_genes_umap1pos.gprofiler$name,
                     "Matriz genes (No UMAP1+ genes)" = top_genes_matriz.gprofiler$name), 
          multi_query = FALSE, 
          evcodes = TRUE, 
          sources = "REAC")

# modify the g:Profiler data frame
gp_mod = multi_gp$result[,c("query", "source", "term_id",
                                "term_name", "p_value", "query_size", 
                                "intersection_size", "term_size", 
                                "effective_domain_size", "intersection")]
gp_mod$GeneRatio = paste0(gp_mod$intersection_size,  "/", gp_mod$query_size)
gp_mod$BgRatio = paste0(gp_mod$term_size, "/", gp_mod$effective_domain_size)
names(gp_mod) = c("Cluster", "Category", "ID", "Description", "p.adjust", 
                    "query_size", "Count", "term_size", "effective_domain_size", 
                    "geneID", "GeneRatio", "BgRatio")
gp_mod$geneID = gsub(",", "/", gp_mod$geneID)

# define as enrichResult object
gp_mod_enrich  = new("enrichResult", result = gp_mod)

gp_mod_enrich@result <- gp_mod_enrich@result %>%
  mutate(generationumerico = Count / query_size) %>%
  arrange(desc(Cluster)) 

enrichplot::dotplot(gp_mod_enrich, showCategory = 20, x = "Cluster", size = "GeneRatio", title = "REAC 10 genes")

write.csv(gp_mod_enrich@result, "top10_genes_umap1pos_REAC.csv")

3 Top 20

3.1 GO

important.genes.top <- important.genes.umap1pos%>%
  head(., 20) 

selected_columns <- colnames(mat_exp_alz_genes.RF) %in% c(important.genes.top$rowname, "class")

df <- mat_exp_alz_genes.RF[, selected_columns]

df_long <- df %>%
  gather(key = "Gene", value = "Expression", -class)

df_long$Gene <- factor(df_long$Gene, levels = important.genes.top$rowname)

ggplot(df_long, aes(x = class, y = Expression, fill = class)) +
  geom_boxplot() +
  facet_wrap(~ Gene, scales = "free_y") +
  labs(title = "Boxplots Genes", x = "UMAP1pos", y = "Expression", fill = "mUMAP1pos") 

matriz.AD.minus.important.genes <- mat_exp_alz_genes.RF[,!colnames(mat_exp_alz_genes.RF) %in% important.genes.top$rowname]

top_genes <- important.genes.top$rowname

gp_up_ordered = gost(list("Specific UMAP1+ genes" = important.genes.top$rowname, 
                          "Matriz genes (No UMAP1+ genes)" = colnames(matriz.AD.minus.important.genes)) , 
                     organism = "hsapiens",
                     sources = "GO", 
                     multi_query = T,
                     highlight = T,
                     exclude_iea = T)

gostplot(gp_up_ordered, interactive = T, capped = T)
top_genes_umap1pos.gprofiler = gconvert(important.genes.top$rowname)
top_genes_matriz.gprofiler = gconvert(colnames(matriz.AD.minus.important.genes))

# enrichment analysis using gene names
multi_gp = gost(list("Specific UMAP1+ genes" = top_genes_umap1pos.gprofiler$name,
                     "Matriz genes (No UMAP1+ genes)" = top_genes_matriz.gprofiler$name), 
          multi_query = FALSE, 
          evcodes = TRUE, 
          sources = "GO", 
          highlight = T,
                     exclude_iea = T)

multi_gp$result <- multi_gp$result %>%
  filter(highlighted == T)

# modify the g:Profiler data frame
gp_mod = multi_gp$result[,c("query", "source", "term_id",
                                "term_name", "p_value", "query_size", 
                                "intersection_size", "term_size", 
                                "effective_domain_size", "intersection")]
gp_mod$GeneRatio = paste0(gp_mod$intersection_size,  "/", gp_mod$query_size)
gp_mod$BgRatio = paste0(gp_mod$term_size, "/", gp_mod$effective_domain_size)
names(gp_mod) = c("Cluster", "Category", "ID", "Description", "p.adjust", 
                    "query_size", "Count", "term_size", "effective_domain_size", 
                    "geneID", "GeneRatio", "BgRatio")
gp_mod$geneID = gsub(",", "/", gp_mod$geneID)

# define as enrichResult object
gp_mod_enrich  = new("enrichResult", result = gp_mod)

gp_mod_enrich@result <- gp_mod_enrich@result %>%
  mutate(generationumerico = Count / query_size) %>%
  arrange(desc(Cluster)) 

enrichplot::dotplot(gp_mod_enrich, showCategory = 20, x = "Cluster", size = "GeneRatio", title = "GO 20 genes")

write.csv(gp_mod_enrich@result, "top20_genes_umap1pos_go.csv")

3.2 KEGG

gp_up_ordered = gost(list("Specific UMAP1+ genes" = important.genes.top$rowname, 
                          "Matriz genes (No UMAP1+ genes)" = colnames(matriz.AD.minus.important.genes)) , 
                     organism = "hsapiens",
                     sources = "KEGG", 
                     multi_query = T,
                     highlight = T)

gostplot(gp_up_ordered, interactive = T, capped = T)
top_genes_umap1pos.gprofiler = gconvert(important.genes.top$rowname)
top_genes_matriz.gprofiler = gconvert(colnames(matriz.AD.minus.important.genes))

# enrichment analysis using gene names
multi_gp = gost(list("Specific UMAP1+ genes" = top_genes_umap1pos.gprofiler$name,
                     "Matriz genes (No UMAP1+ genes)" = top_genes_matriz.gprofiler$name), 
          multi_query = FALSE, 
          evcodes = TRUE, 
          sources = "KEGG")

# modify the g:Profiler data frame
gp_mod = multi_gp$result[,c("query", "source", "term_id",
                                "term_name", "p_value", "query_size", 
                                "intersection_size", "term_size", 
                                "effective_domain_size", "intersection")]
gp_mod$GeneRatio = paste0(gp_mod$intersection_size,  "/", gp_mod$query_size)
gp_mod$BgRatio = paste0(gp_mod$term_size, "/", gp_mod$effective_domain_size)
names(gp_mod) = c("Cluster", "Category", "ID", "Description", "p.adjust", 
                    "query_size", "Count", "term_size", "effective_domain_size", 
                    "geneID", "GeneRatio", "BgRatio")
gp_mod$geneID = gsub(",", "/", gp_mod$geneID)

# define as enrichResult object
gp_mod_enrich  = new("enrichResult", result = gp_mod)

gp_mod_enrich@result <- gp_mod_enrich@result %>%
  mutate(generationumerico = Count / query_size) %>%
  arrange(desc(generationumerico)) 

enrichplot::dotplot(gp_mod_enrich, showCategory = 30, x = "Cluster", size = "GeneRatio", title = "KEGG 20 genes")

write.csv(gp_mod_enrich@result, "top20_genes_umap1pos_kegg.csv")

3.3 REAC

gp_up_ordered = gost(list("Specific UMAP1+ genes" = important.genes.top$rowname, 
                          "Matriz genes (No UMAP1+ genes)" = colnames(matriz.AD.minus.important.genes)) , 
                     organism = "hsapiens",
                     sources = "REAC", 
                     multi_query = T,
                     highlight = T)

gostplot(gp_up_ordered, interactive = T, capped = T)
top_genes_umap1pos.gprofiler = gconvert(important.genes.top$rowname)
top_genes_matriz.gprofiler = gconvert(colnames(matriz.AD.minus.important.genes))

# enrichment analysis using gene names
multi_gp = gost(list("Specific UMAP1+ genes" = top_genes_umap1pos.gprofiler$name,
                     "Matriz genes (No UMAP1+ genes)" = top_genes_matriz.gprofiler$name), 
          multi_query = FALSE, 
          evcodes = TRUE, 
          sources = "REAC")

# modify the g:Profiler data frame
gp_mod = multi_gp$result[,c("query", "source", "term_id",
                                "term_name", "p_value", "query_size", 
                                "intersection_size", "term_size", 
                                "effective_domain_size", "intersection")]
gp_mod$GeneRatio = paste0(gp_mod$intersection_size,  "/", gp_mod$query_size)
gp_mod$BgRatio = paste0(gp_mod$term_size, "/", gp_mod$effective_domain_size)
names(gp_mod) = c("Cluster", "Category", "ID", "Description", "p.adjust", 
                    "query_size", "Count", "term_size", "effective_domain_size", 
                    "geneID", "GeneRatio", "BgRatio")
gp_mod$geneID = gsub(",", "/", gp_mod$geneID)

# define as enrichResult object
gp_mod_enrich  = new("enrichResult", result = gp_mod)

gp_mod_enrich@result <- gp_mod_enrich@result %>%
  mutate(generationumerico = Count / query_size) %>%
  arrange(desc(Cluster)) 

enrichplot::dotplot(gp_mod_enrich, showCategory = 20, x = "Cluster", size = "GeneRatio", title = "REAC 20 genes")

write.csv(gp_mod_enrich@result, "top20_genes_umap1pos_reac.csv")

4 Top 30

4.1 GO

important.genes.top <- important.genes.umap1pos%>%
  head(., 30) 

selected_columns <- colnames(mat_exp_alz_genes.RF) %in% c(important.genes.top$rowname, "class")

df <- mat_exp_alz_genes.RF[, selected_columns]

df_long <- df %>%
  gather(key = "Gene", value = "Expression", -class)

df_long$Gene <- factor(df_long$Gene, levels = important.genes.top$rowname)

ggplot(df_long, aes(x = class, y = Expression, fill = class)) +
  geom_boxplot() +
  facet_wrap(~ Gene, scales = "free_y") +
  labs(title = "Boxplots Genes", x = "UMAP1pos", y = "Expression", fill = "mUMAP1pos") 

matriz.AD.minus.important.genes <- mat_exp_alz_genes.RF[,!colnames(mat_exp_alz_genes.RF) %in% important.genes.top$rowname]

top_genes <- important.genes.top$rowname

gp_up_ordered = gost(list("Specific UMAP1+ genes" = important.genes.top$rowname, 
                          "Matriz genes (No UMAP1+ genes)" = colnames(matriz.AD.minus.important.genes)) , 
                     organism = "hsapiens",
                     sources = "GO", 
                     multi_query = T,
                     highlight = T,
                     exclude_iea = T)

gostplot(gp_up_ordered, interactive = T, capped = T)
top_genes_umap1pos.gprofiler = gconvert(important.genes.top$rowname[important.genes.top$rowname %in% genes_upregulados])
top_genes_matriz.gprofiler = gconvert(colnames(matriz.AD.minus.important.genes)[colnames(matriz.AD.minus.important.genes) %in% genes_upregulados])

# enrichment analysis using gene names
multi_gp = gost(list("Specific UMAP1+ genes" = top_genes_umap1pos.gprofiler$name,
                     "Matriz genes (No UMAP1+ genes)" = top_genes_matriz.gprofiler$name), 
          multi_query = FALSE, 
          evcodes = TRUE, 
          sources = "GO", 
          highlight = T,
                     exclude_iea = T)

multi_gp$result <- multi_gp$result %>%
  filter(highlighted == T)

# modify the g:Profiler data frame
gp_mod = multi_gp$result[,c("query", "source", "term_id",
                                "term_name", "p_value", "query_size", 
                                "intersection_size", "term_size", 
                                "effective_domain_size", "intersection")]
gp_mod$GeneRatio = paste0(gp_mod$intersection_size,  "/", gp_mod$query_size)
gp_mod$BgRatio = paste0(gp_mod$term_size, "/", gp_mod$effective_domain_size)
names(gp_mod) = c("Cluster", "Category", "ID", "Description", "p.adjust", 
                    "query_size", "Count", "term_size", "effective_domain_size", 
                    "geneID", "GeneRatio", "BgRatio")
gp_mod$geneID = gsub(",", "/", gp_mod$geneID)

# define as enrichResult object
gp_mod_enrich  = new("enrichResult", result = gp_mod)

gp_mod_enrich@result <- gp_mod_enrich@result %>%
  mutate(generationumerico = Count / query_size) %>%
  arrange(desc(Cluster)) 

enrichplot::dotplot(gp_mod_enrich, showCategory = 20, x = "Cluster", size = "GeneRatio", title = "GO 30 genes upregulated")

write.csv(gp_mod_enrich@result, "top30_genes_umap1pos_go.csv")
top_genes_umap1pos.gprofiler = gconvert(important.genes.top$rowname[important.genes.top$rowname %in% genes_downregulados])
top_genes_matriz.gprofiler = gconvert(colnames(matriz.AD.minus.important.genes)[colnames(matriz.AD.minus.important.genes) %in% genes_downregulados])

# enrichment analysis using gene names
multi_gp = gost(list("Specific UMAP1+ genes" = top_genes_umap1pos.gprofiler$name,
                     "Matriz genes (No UMAP1+ genes)" = top_genes_matriz.gprofiler$name), 
          multi_query = FALSE, 
          evcodes = TRUE, 
          sources = "GO", 
          highlight = T,
                     exclude_iea = T)

multi_gp$result <- multi_gp$result %>%
  filter(highlighted == T)

# modify the g:Profiler data frame
gp_mod = multi_gp$result[,c("query", "source", "term_id",
                                "term_name", "p_value", "query_size", 
                                "intersection_size", "term_size", 
                                "effective_domain_size", "intersection")]
gp_mod$GeneRatio = paste0(gp_mod$intersection_size,  "/", gp_mod$query_size)
gp_mod$BgRatio = paste0(gp_mod$term_size, "/", gp_mod$effective_domain_size)
names(gp_mod) = c("Cluster", "Category", "ID", "Description", "p.adjust", 
                    "query_size", "Count", "term_size", "effective_domain_size", 
                    "geneID", "GeneRatio", "BgRatio")
gp_mod$geneID = gsub(",", "/", gp_mod$geneID)

# define as enrichResult object
gp_mod_enrich  = new("enrichResult", result = gp_mod)

gp_mod_enrich@result <- gp_mod_enrich@result %>%
  mutate(generationumerico = Count / query_size) %>%
  arrange(desc(Cluster)) 

enrichplot::dotplot(gp_mod_enrich, showCategory = 20, x = "Cluster", size = "GeneRatio", title = "GO 30 genes downregulated")

write.csv(gp_mod_enrich@result, "top30_genes_umap1pos_go.csv")

4.2 KEGG

gp_up_ordered = gost(list("Specific UMAP1+ genes" = important.genes.top$rowname, 
                          "Matriz genes (No UMAP1+ genes)" = colnames(matriz.AD.minus.important.genes)) , 
                     organism = "hsapiens",
                     sources = "KEGG", 
                     multi_query = T,
                     highlight = T)

gostplot(gp_up_ordered, interactive = T, capped = T)
top_genes_umap1pos.gprofiler = gconvert(important.genes.top$rowname)
top_genes_matriz.gprofiler = gconvert(colnames(matriz.AD.minus.important.genes))

# enrichment analysis using gene names
multi_gp = gost(list("Specific UMAP1+ genes" = top_genes_umap1pos.gprofiler$name,
                     "Matriz genes (No UMAP1+ genes)" = top_genes_matriz.gprofiler$name), 
          multi_query = FALSE, 
          evcodes = TRUE, 
          sources = "KEGG")

# modify the g:Profiler data frame
gp_mod = multi_gp$result[,c("query", "source", "term_id",
                                "term_name", "p_value", "query_size", 
                                "intersection_size", "term_size", 
                                "effective_domain_size", "intersection")]
gp_mod$GeneRatio = paste0(gp_mod$intersection_size,  "/", gp_mod$query_size)
gp_mod$BgRatio = paste0(gp_mod$term_size, "/", gp_mod$effective_domain_size)
names(gp_mod) = c("Cluster", "Category", "ID", "Description", "p.adjust", 
                    "query_size", "Count", "term_size", "effective_domain_size", 
                    "geneID", "GeneRatio", "BgRatio")
gp_mod$geneID = gsub(",", "/", gp_mod$geneID)

# define as enrichResult object
gp_mod_enrich  = new("enrichResult", result = gp_mod)

gp_mod_enrich@result <- gp_mod_enrich@result %>%
  mutate(generationumerico = Count / query_size) %>%
  arrange(desc(generationumerico)) 

enrichplot::dotplot(gp_mod_enrich, showCategory = 30, x = "Cluster", size = "GeneRatio", title = "KEGG 30 genes")

write.csv(gp_mod_enrich@result, "top30_genes_umap1pos_kegg.csv")

4.3 REAC

gp_up_ordered = gost(list("Specific UMAP1+ genes" = important.genes.top$rowname, 
                          "Matriz genes (No UMAP1+ genes)" = colnames(matriz.AD.minus.important.genes)) , 
                     organism = "hsapiens",
                     sources = "REAC", 
                     multi_query = T,
                     highlight = T)

gostplot(gp_up_ordered, interactive = T, capped = T)
top_genes_umap1pos.gprofiler = gconvert(important.genes.top$rowname)
top_genes_matriz.gprofiler = gconvert(colnames(matriz.AD.minus.important.genes))

# enrichment analysis using gene names
multi_gp = gost(list("Specific UMAP1+ genes" = top_genes_umap1pos.gprofiler$name,
                     "Matriz genes (No UMAP1+ genes)" = top_genes_matriz.gprofiler$name), 
          multi_query = FALSE, 
          evcodes = TRUE, 
          sources = "REAC")

# modify the g:Profiler data frame
gp_mod = multi_gp$result[,c("query", "source", "term_id",
                                "term_name", "p_value", "query_size", 
                                "intersection_size", "term_size", 
                                "effective_domain_size", "intersection")]
gp_mod$GeneRatio = paste0(gp_mod$intersection_size,  "/", gp_mod$query_size)
gp_mod$BgRatio = paste0(gp_mod$term_size, "/", gp_mod$effective_domain_size)
names(gp_mod) = c("Cluster", "Category", "ID", "Description", "p.adjust", 
                    "query_size", "Count", "term_size", "effective_domain_size", 
                    "geneID", "GeneRatio", "BgRatio")
gp_mod$geneID = gsub(",", "/", gp_mod$geneID)

# define as enrichResult object
gp_mod_enrich  = new("enrichResult", result = gp_mod)

gp_mod_enrich@result <- gp_mod_enrich@result %>%
  mutate(generationumerico = Count / query_size) %>%
  arrange(desc(Cluster)) 

enrichplot::dotplot(gp_mod_enrich, showCategory = 20, x = "Cluster", size = "GeneRatio", title = "REAC 30 genes")

write.csv(gp_mod_enrich@result, "top30_genes_umap1pos_reac.csv")

5 Top 40

5.1 GO

important.genes.top <- important.genes.umap1pos%>%
  head(., 40) 

selected_columns <- colnames(mat_exp_alz_genes.RF) %in% c(important.genes.top$rowname, "class")

df <- mat_exp_alz_genes.RF[, selected_columns]

df_long <- df %>%
  gather(key = "Gene", value = "Expression", -class)

df_long$Gene <- factor(df_long$Gene, levels = important.genes.top$rowname)

ggplot(df_long, aes(x = class, y = Expression, fill = class)) +
  geom_boxplot() +
  facet_wrap(~ Gene, scales = "free_y") +
  labs(title = "Boxplots Genes", x = "UMAP1pos", y = "Expression", fill = "mUMAP1pos") 

matriz.AD.minus.important.genes <- mat_exp_alz_genes.RF[,!colnames(mat_exp_alz_genes.RF) %in% important.genes.top$rowname]

top_genes <- important.genes.top$rowname

gp_up_ordered = gost(list("Specific UMAP1+ genes" = important.genes.top$rowname, 
                          "Matriz genes (No UMAP1+ genes)" = colnames(matriz.AD.minus.important.genes)) , 
                     organism = "hsapiens",
                     sources = "GO", 
                     multi_query = T,
                     highlight = T,
                     exclude_iea = T)

gostplot(gp_up_ordered, interactive = T, capped = T)
top_genes_umap1pos.gprofiler = gconvert(important.genes.top$rowname)
top_genes_matriz.gprofiler = gconvert(colnames(matriz.AD.minus.important.genes))

# enrichment analysis using gene names
multi_gp = gost(list("Specific UMAP1+ genes" = top_genes_umap1pos.gprofiler$name,
                     "Matriz genes (No UMAP1+ genes)" = top_genes_matriz.gprofiler$name), 
          multi_query = FALSE, 
          evcodes = TRUE, 
          sources = "GO", 
          highlight = T,
                     exclude_iea = T)

multi_gp$result <- multi_gp$result %>%
  filter(highlighted == T)

# modify the g:Profiler data frame
gp_mod = multi_gp$result[,c("query", "source", "term_id",
                                "term_name", "p_value", "query_size", 
                                "intersection_size", "term_size", 
                                "effective_domain_size", "intersection")]
gp_mod$GeneRatio = paste0(gp_mod$intersection_size,  "/", gp_mod$query_size)
gp_mod$BgRatio = paste0(gp_mod$term_size, "/", gp_mod$effective_domain_size)
names(gp_mod) = c("Cluster", "Category", "ID", "Description", "p.adjust", 
                    "query_size", "Count", "term_size", "effective_domain_size", 
                    "geneID", "GeneRatio", "BgRatio")
gp_mod$geneID = gsub(",", "/", gp_mod$geneID)

# define as enrichResult object
gp_mod_enrich  = new("enrichResult", result = gp_mod)

gp_mod_enrich@result <- gp_mod_enrich@result %>%
  mutate(generationumerico = Count / query_size) %>%
  arrange(desc(Cluster)) 

enrichplot::dotplot(gp_mod_enrich, showCategory = 20, x = "Cluster", size = "GeneRatio", title = "GO 40 genes")

write.csv(gp_mod_enrich@result, "top40_genes_umap1pos_go.csv")

5.2 KEGG

gp_up_ordered = gost(list("Specific UMAP1+ genes" = important.genes.top$rowname, 
                          "Matriz genes (No UMAP1+ genes)" = colnames(matriz.AD.minus.important.genes)) , 
                     organism = "hsapiens",
                     sources = "KEGG", 
                     multi_query = T,
                     highlight = T)

gostplot(gp_up_ordered, interactive = T, capped = T)
top_genes_umap1pos.gprofiler = gconvert(important.genes.top$rowname)
top_genes_matriz.gprofiler = gconvert(colnames(matriz.AD.minus.important.genes))

# enrichment analysis using gene names
multi_gp = gost(list("Specific UMAP1+ genes" = top_genes_umap1pos.gprofiler$name,
                     "Matriz genes (No UMAP1+ genes)" = top_genes_matriz.gprofiler$name), 
          multi_query = FALSE, 
          evcodes = TRUE, 
          sources = "KEGG")

# modify the g:Profiler data frame
gp_mod = multi_gp$result[,c("query", "source", "term_id",
                                "term_name", "p_value", "query_size", 
                                "intersection_size", "term_size", 
                                "effective_domain_size", "intersection")]
gp_mod$GeneRatio = paste0(gp_mod$intersection_size,  "/", gp_mod$query_size)
gp_mod$BgRatio = paste0(gp_mod$term_size, "/", gp_mod$effective_domain_size)
names(gp_mod) = c("Cluster", "Category", "ID", "Description", "p.adjust", 
                    "query_size", "Count", "term_size", "effective_domain_size", 
                    "geneID", "GeneRatio", "BgRatio")
gp_mod$geneID = gsub(",", "/", gp_mod$geneID)

# define as enrichResult object
gp_mod_enrich  = new("enrichResult", result = gp_mod)

gp_mod_enrich@result <- gp_mod_enrich@result %>%
  mutate(generationumerico = Count / query_size) %>%
  arrange(desc(generationumerico)) 

enrichplot::dotplot(gp_mod_enrich, showCategory = 30, x = "Cluster", size = "GeneRatio", title = "KEGG 40 genes")

write.csv(gp_mod_enrich@result, "top40_genes_umap1pos_kegg.csv")

5.3 REAC

gp_up_ordered = gost(list("Specific UMAP1+ genes" = important.genes.top$rowname, 
                          "Matriz genes (No UMAP1+ genes)" = colnames(matriz.AD.minus.important.genes)) , 
                     organism = "hsapiens",
                     sources = "REAC", 
                     multi_query = T,
                     highlight = T)

gostplot(gp_up_ordered, interactive = T, capped = T)
top_genes_umap1pos.gprofiler = gconvert(important.genes.top$rowname)
top_genes_matriz.gprofiler = gconvert(colnames(matriz.AD.minus.important.genes))

# enrichment analysis using gene names
multi_gp = gost(list("Specific UMAP1+ genes" = top_genes_umap1pos.gprofiler$name,
                     "Matriz genes (No UMAP1+ genes)" = top_genes_matriz.gprofiler$name), 
          multi_query = FALSE, 
          evcodes = TRUE, 
          sources = "REAC")

# modify the g:Profiler data frame
gp_mod = multi_gp$result[,c("query", "source", "term_id",
                                "term_name", "p_value", "query_size", 
                                "intersection_size", "term_size", 
                                "effective_domain_size", "intersection")]
gp_mod$GeneRatio = paste0(gp_mod$intersection_size,  "/", gp_mod$query_size)
gp_mod$BgRatio = paste0(gp_mod$term_size, "/", gp_mod$effective_domain_size)
names(gp_mod) = c("Cluster", "Category", "ID", "Description", "p.adjust", 
                    "query_size", "Count", "term_size", "effective_domain_size", 
                    "geneID", "GeneRatio", "BgRatio")
gp_mod$geneID = gsub(",", "/", gp_mod$geneID)

# define as enrichResult object
gp_mod_enrich  = new("enrichResult", result = gp_mod)

gp_mod_enrich@result <- gp_mod_enrich@result %>%
  mutate(generationumerico = Count / query_size) %>%
  arrange(desc(Cluster)) 

enrichplot::dotplot(gp_mod_enrich, showCategory = 20, x = "Cluster", size = "GeneRatio", title = "REAC 40 genes")

write.csv(gp_mod_enrich@result, "top40_genes_umap1pos_reac.csv")